# -*-Python-*-
# Billion word language modeling benchmark from t2t - pretokenized

import mesh_tensorflow.transformer.t2t_vocabulary

# Dataset
# Use `pretokenized_t2t_dataset` as the training dataset function for
# `utils.run`. The reference has no trailing "()", so the function itself is
# bound (not the result of calling it).
utils.run.train_dataset_fn = @mesh_tensorflow.transformer.dataset.pretokenized_t2t_dataset
# Name of the pretokenized dataset handed to `pretokenized_t2t_dataset`.
dataset.pretokenized_t2t_dataset.dataset_name = "languagemodel_lm1b32k"
# NOTE(review): `text2self = True` presumably selects targets-only
# (language-model style) examples, matching `model_type = "lm"` elsewhere in
# this config -- confirm against `pretokenized_t2t_dataset`.
dataset.pretokenized_t2t_dataset.text2self = True

# Vocabulary
# The trailing "()" makes this an evaluated reference: `get_t2t_vocabulary` is
# called and its return value is what gets supplied as `utils.run.vocabulary`.
utils.run.vocabulary = @mesh_tensorflow.transformer.t2t_vocabulary.get_t2t_vocabulary()
# Vocabulary file name passed to `get_t2t_vocabulary`.
# NOTE(review): presumably this must be the vocabulary the dataset was
# pretokenized with -- confirm before changing either setting independently.
t2t_vocabulary.get_t2t_vocabulary.vocabulary_filename = "vocab.languagemodel_lm1b32k.32768.subwords"

# Model
# Set the `model_type` parameter of `run` to "lm".
# NOTE(review): this selector omits the `utils.` prefix used by the other
# `utils.run.*` bindings in this file; presumably it resolves to the same
# configurable -- confirm it is unambiguous in the importing program.
run.model_type = "lm"

# No custom ops required:
# turn off `use_custom_ops` for `dataset.pack_dataset` so that this
# configuration does not depend on custom ops being available.
dataset.pack_dataset.use_custom_ops = False
